import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
#run pip install mlxtend in your terminal to install mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
import statsmodels.api as sm
import statsmodels.formula.api as smf
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
This study aims to investigate the relationship between PM2.5 concentration and other atmospheric pollutants, and the influence of meteorological conditions on PM2.5 concentration. Mathematical statistics methods were employed to analyze data on PM2.5, atmospheric pollutants, and meteorology in the Changping, Dongsi, and Huairou districts of Beijing from March 2013 to February 2017. The bivariate correlation analysis showed that PM2.5 concentration has a strong correlation with the concentrations of atmospheric pollutants such as PM10, SO2, NO2, O3, and CO, and with meteorological conditions such as PRES and WSPM. For the fitting models we used OLS regression with forward Sequential Feature Selection, and a GBM model, for each season. After comparing the R-squared values, we chose the GBM model of PM2.5 concentration for each season. --Mary
# Import the three station data sets, drop the redundant first (row-number)
# column of each, and stack them into a single frame.
changping = pd.read_csv('/Users/Rocco/UCLA/Stats 131/Final Project/PRSA_Data_20130301-20170228/PRSA_Data_Changping_20130301-20170228.csv').iloc[:, 1:]
dongsi = pd.read_csv('/Users/Rocco/UCLA/Stats 131/Final Project/PRSA_Data_20130301-20170228/PRSA_Data_Dongsi_20130301-20170228.csv').iloc[:, 1:]
huairou = pd.read_csv('/Users/Rocco/UCLA/Stats 131/Final Project/PRSA_Data_20130301-20170228/PRSA_Data_Huairou_20130301-20170228.csv').iloc[:, 1:]
data = pd.concat([changping, dongsi, huairou], axis=0)
# Assemble a "date" column (YYYY-MM-DD HH:MM:SS) from the calendar components
date = pd.to_datetime({"year": data.year, "month": data.month, "day": data.day, "hour": data.hour})
data['date'] = date
# Re-number the rows 0..n-1 across the three concatenated frames
data = data.reset_index(drop=True)
data.head()
data.tail()
# inspect dtypes / non-null counts to see whether there are NA values
data.info()
# export the merged csv file (before removing NA) for future use
data.to_csv("data(hasna).csv", index=False)
# create a list that contains the column names where there are NAs
nanlistname = ['PM2.5','PM10','SO2','NO2','CO','O3','TEMP','PRES','DEWP','RAIN','WSPM']
# define a function that cleans na
def fillna(columns):
    """Fill missing values of each pandas Series in *columns*, in place.

    Each interior NaN is replaced by the mean of its two nearest
    neighbours: the previous (already-filled) value and the next observed
    value, so a run of NaNs is filled by a cascading average.  A leading
    NaN is replaced by the next observed value (the original version
    crashed here: with i == 0 it indexed c[-1], a KeyError on a
    RangeIndex).  A trailing run with no later observed value is left as
    NaN, exactly as before.
    """
    for c in columns:
        # snapshot of which entries were observed; values we fill in are
        # deliberately NOT treated as observed (matches the original logic)
        observed = c.notnull().to_numpy()
        n = len(c)  # bug fix: was data.shape[0] — a hidden dependency on the global frame
        for i in range(n):
            if not observed[i]:
                # scan forward for the next observed value
                for k in range(i + 1, n):
                    if observed[k]:
                        if i == 0:
                            c[i] = c[k]  # no left neighbour to average with
                        else:
                            c[i] = (c[i - 1] + c[k]) / 2
                        break
# fill na for 'wd' using the previous non-na value.
# Fix: Series.fillna(method=..., inplace=True) on a column selection is a
# deprecated chained assignment (removed behavior under pandas copy-on-write);
# assign the filled column back instead.
data['wd'] = data['wd'].ffill()
# fill na for the other columns that contain na; per-column filling is
# independent, so iterating nanlistname is equivalent to the original
# explicit (differently ordered) list of columns
fillna([data[col] for col in nanlistname])
# check if na's have been filled up
data.info()
# export the non na csv file
data.to_csv("data(nona).csv", index=False)
Now we have completed the imputation of missing data. Starting from here, we will import the data(nona).csv file every time we launch the notebook (to save time)!
# Import the no-na data file
data = pd.read_csv('/Users/Rocco/UCLA/Stats 131/Final Project/data(nona).csv')
wd = data.wd  # keep the raw wind-direction strings so they can be re-attached below
data.head()
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Integer code for each of the 16 compass directions.  The dict's insertion
# order matches the numeric order 1..16, so the one-hot columns produced
# below line up with these keys.
wd_num = {
    'E': 1,
    'ENE': 2,
    'ESE': 3,
    'N': 4,
    'NE': 5,
    'NNE': 6,
    'NNW': 7,
    'NW': 8,
    'S': 9,
    'SE': 10,
    'SSE': 11,
    'SSW': 12,
    'SW': 13,
    'W': 14,
    'WNW': 15,
    'WSW': 16,
}
# already an (n, 1) column vector, ready for OneHotEncoder
# (fix: the original then called a no-op reshape(len, 1), removed here)
wd_encoded = np.array([[wd_num.get(value)] for value in data.wd])
label_encoder = LabelEncoder()  # NOTE(review): unused here — kept in case a later cell relies on it
# NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 — update when upgrading scikit-learn
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(wd_encoded)
encoded_wd = pd.DataFrame.from_records(onehot_encoded, columns=wd_num.keys())
encoded_wd
# replace the string column with the 16 indicator columns, then append the
# original strings back as the last column
data = pd.concat([data.drop(['wd'], axis=1), encoded_wd, wd], axis=1)
# Use (station, date) as a hierarchical index
data = data.set_index(['station', 'date'])
data.head()
data.tail()
data.info()
# check the dimension of the data set for each station
for station_name in ('Changping', 'Dongsi', 'Huairou'):
    print(data.loc[station_name].shape)
# select via the outer then inner index, e.g.:
# data.loc[('Huairou', '2017-01-01 00:00:00'):('Huairou', '2017-01-31 23:00:00')]
# Build the month -> season lookup dictionary
s = {}
for months, season_name in [((3, 4, 5), 'Spring'),
                            ((6, 7, 8), 'Summer'),
                            ((9, 10, 11), 'Fall'),
                            ((12, 1, 2), 'Winter')]:
    for m in months:
        s[m] = season_name
# add a 'season' column that indicates the season of each row
data["season"] = data.month.map(s)
data.head()
data.tail()
Since Winter spans two years, we need to define a "seasonal year" column that indicates correct corresponding year for each season.
Note our data is from 2013-03-01 to 2017-02-28, and we have Spring (2013, 2014, 2015, 2016), Summer (2013, 2014, 2015, 2016), Fall (2013, 2014, 2015, 2016), and Winter (2013, 2014, 2015, 2016). Thus, we will consider the date from 2013-12 to 2014-02 as 2013 winter, etc.
# concatenate year and month columns, e.g. (2013, 3) -> 20133
yearmonth = data[['year', 'month']].astype(str).apply(lambda x: ''.join(x), axis=1).astype(int)
yearmonth
# Map each year+month code to its "seasonal year": December-February belong
# to the winter that started in the earlier year (e.g. 2014-01 -> 2013).
seasonal_year = {
    20133: 2013, 20143: 2014, 20153: 2015, 20163: 2016,
    20134: 2013, 20144: 2014, 20154: 2015, 20164: 2016,
    20135: 2013, 20145: 2014, 20155: 2015, 20165: 2016,
    20136: 2013, 20146: 2014, 20156: 2015, 20166: 2016,
    20137: 2013, 20147: 2014, 20157: 2015, 20167: 2016,
    # BUG FIX: this row previously ended with a duplicate key 20167, so
    # August 2016 (20168) was missing and mapped to None
    20138: 2013, 20148: 2014, 20158: 2015, 20168: 2016,
    20139: 2013, 20149: 2014, 20159: 2015, 20169: 2016,
    201310: 2013, 201410: 2014, 201510: 2015, 201610: 2016,
    201311: 2013, 201411: 2014, 201511: 2015, 201611: 2016,
    201312: 2013, 201412: 2014, 201512: 2015, 201612: 2016,
    20141: 2013, 20151: 2014, 20161: 2015, 20171: 2016,
    20142: 2013, 20152: 2014, 20162: 2015, 20172: 2016
}
data["seasonal year"] = [seasonal_year.get(value) for value in yearmonth]
data.head()
data.tail()
So far, we have done the data cleaning and manipulation.
# Find the seasonal mean of each variable for each station in each (seasonal) year.
# numeric_only=True: the slice still contains the string column 'wd', which
# pandas >= 2.0 refuses to average (older pandas silently dropped it).
seasonmean = data.iloc[:,4:].groupby(["station","seasonal year","season"], sort=False).mean(numeric_only=True).unstack()
seasonmean
# plot the change in PM2.5
seasonmean[["PM2.5"]].groupby("station").plot()
As we can see from the graphs above, in each station Winter PM2.5 levels are generally higher than those of other seasons (except in 2014), and Summer PM2.5 levels are always lower than those of other seasons. Furthermore, the PM2.5 level at each station shows an overall decreasing trend in the first three years but rebounds slightly in 2016.
Let's look at changes in other pollutants.
# seasonal mean PM10 per station
seasonmean[['PM10']].groupby("station").plot()
Surprisingly, for PM10, Spring is the worst. This might be due to pollination in spring. Since we are more concerned about the air pollution from fine inhalable particles (PM2.5) and PM10 is confounded with PM2.5, thus we will disregard PM10 from now on.
# seasonal means of the gaseous pollutants per station
seasonmean[['SO2']].groupby("station").plot()
seasonmean[['NO2']].groupby("station").plot()
seasonmean[['CO']].groupby("station").plot()
For SO2, NO2 and CO, the seasonal trends match that of PM2.5: overall, Winter has higher pollutant levels than any other season and Summer has lower levels than the other seasons. Also notice that, overall, SO2 is decreasing over the years.
# seasonal mean O3 per station
seasonmean[['O3']].groupby("station").plot()
For O3, the results are just reversed, which makes sense since ozone has the highest concentration in summer and lowest in winter.
# seasonal mean temperature per station
seasonmean[['TEMP']].groupby("station").plot()
We can see from the plots above that temperature is seasonal: summers are hot and winters are cold in Beijing. Temperature appears to have a negative relationship with PM2.5 — the high-PM2.5 winters coincide with the lowest temperatures — though this should be confirmed with the correlation analysis below.
# seasonal mean air pressure per station
seasonmean[['PRES']].groupby("station").plot()
pressure plots show that summer is low and winter is high in Beijing. And Dongsi has overall high pressure than the other two regions.
# seasonal mean dew point per station
seasonmean[['DEWP']].groupby("station").plot()
DEWP is high in summer and low in winter in Beijing. For each station, DEWP has similar trend in each season which indicates that DEWP is consistent in Beijing.
# seasonal mean precipitation per station
seasonmean[['RAIN']].groupby("station").plot()
We can see that RAIN is high in summer and low in winter in Beijing, but we can see that the rain in summer in 2014 is much lower than any others in summer.
# seasonal mean wind speed per station
seasonmean[['WSPM']].groupby("station").plot()
WSPM varies over year but we can see that overall, Spring and Fall have stronger wind compared to Summer and Winter.
Since we are only interested in seasonal change, we use the seaonal mean of each pollutant to analyze the data.
# Per-station means, grouped by seasonal year then season.
# numeric_only=True: the slice still contains the string column 'wd', which
# pandas >= 2.0 refuses to average (older pandas silently dropped it).
stationmean = data.iloc[:,4:].groupby(["seasonal year","season","station"], sort=False).mean(numeric_only=True).unstack()
stationmean
stationmean[['PM2.5']].plot()
As we can see, Dongsi has a much higher level of PM2.5 than the other two stations. This might be due to the fact that Dongsi is located in the center of Beijing, which has more cars and hence more pollutants.
# per-station means of the gaseous pollutants, by seasonal year and season
stationmean[['SO2']].plot()
stationmean[['NO2']].plot()
stationmean[['CO']].plot()
These graphs support our guess that Dongsi has more pollutants than the other two stations due to its geographical location.
# Yearly (seasonal-year) mean of each major pollutant per station, as bar
# charts.  The original notebook repeated this cell four times with only the
# pollutant name changed; a loop produces the same four figures in the same
# order.  numeric_only=True guards the string columns under pandas >= 2.0.
for pollutant in ["PM2.5", "SO2", "NO2", "CO"]:
    yearmean = data.groupby(["station","seasonal year"]).mean(numeric_only=True)[[pollutant]].unstack()
    yearmean.plot(kind='bar')
    plt.legend(loc='upper center', bbox_to_anchor=(1.2, 1))
    plt.ylabel(pollutant)
As the bar plots show, PM2.5 levels, as well as the other pollutants, do generally decrease from 2013 to 2015 but rebound in 2016. (Note: these are seasonal years.)
First, let's separate the data into the four seasons and begin by looking at the correlation plots for each season.
# Slice the data by season.  The statsmodels formula API used later cannot
# handle the dot in "PM2.5", so rename that column to PM2_5 in each slice.
_rename = {'PM2.5': 'PM2_5'}
spring = data[data.season == "Spring"].rename(columns=_rename)
summer = data[data.season == "Summer"].rename(columns=_rename)
fall = data[data.season == "Fall"].rename(columns=_rename)
winter = data[data.season == "Winter"].rename(columns=_rename)
# (Skip the sns.pairplot calls below if your computer is slow.)
# pairplot for spring
# pairplot for spring (columns 4..14: PM2.5 and the pollutant/weather variables)
sns.pairplot(spring.iloc[:,4:15].reset_index(level='station'), hue="station")
# pairplot for summer
sns.pairplot(summer.iloc[:,4:15].reset_index(level='station'), hue="station")
# pairplot for fall
sns.pairplot(fall.iloc[:,4:15].reset_index(level='station'), hue="station")
# pairplot for winter
sns.pairplot(winter.iloc[:,4:15].reset_index(level='station'), hue="station")
Note that there are many scattered dots in "vertical" or "horizontal" patterns; this is an artifact of the missing data we filled in (runs of NaNs were replaced by interpolated values, producing these regular streaks).
# Pairwise correlations within each (station, season) group, excluding the
# calendar columns.
corr_by_sta_seas = data.drop(['year','month','day','hour','seasonal year'], axis=1).groupby(["station","season"]).corr()
# correlation of every variable with PM2.5, one column per season
corr_by_sta_seas.loc['Changping'][['PM2.5']].unstack(0).iloc[1:11,:]
corr_by_sta_seas.loc['Dongsi'][['PM2.5']].unstack(0).iloc[1:11,:]
corr_by_sta_seas.loc['Huairou'][['PM2.5']].unstack(0).iloc[1:11,:]
In each station during each season...
PM10 is always highly correlated to PM2.5;
SO2 is relatively highly correlated to PM2.5 in Spring and Winter but weakly correlated in Summer and Fall;
NO2 is highly correlated to PM2.5 in Spring, Fall,and Winter, but weakly correlated in Summer;
CO is highly correlated to PM2.5 in Spring, Fall,and Winter, but relatively weakly correlated in Summer;
O3 does not seem highly correlated to PM2.5 but does has somewhat negative correlation between PM2.5 in Winter;
Among the five meteorological conditions, only the dew point in winter seems highly correlated to PM2.5; the others — temperature, pressure, precipitation and wind speed — do not seem correlated to PM2.5 in any season.
we begin to do OLS regression analysis and see what are statistically significant to the change in PM2.5. First, let's do forward feature selection to decide which features yield the best model in terms of $R^2$.
# Forward Sequential Feature Selection for every (season, station) pair.
# The original notebook repeated this cell twelve times with only the data
# slice and the title changed; it is factored into one helper plus a double
# loop that produces identical output in the identical order:
# Spring/Summer/Fall/Winter, each over Changping, Dongsi, Huairou.
def run_sfs(season_df, station, season_name):
    """Run forward SFS (up to 25 features, 5-fold CV, R^2 scoring) on one
    station's rows of one season frame; print the average CV score per
    feature count and plot the selection curve with standard-error bands."""
    X = season_df.loc[station].values[:, 6:31]  # predictors; note we are excluding PM10
    y = season_df.loc[station].values[:, 4].ravel()  # column 4 is PM2.5
    lr = LinearRegression()
    sfsfit = sfs(lr,
                 k_features=25,
                 forward=True,
                 floating=False,
                 verbose=0,
                 scoring='r2',
                 cv=5).fit(X, y)
    print(pd.DataFrame.from_dict(sfsfit.get_metric_dict()).T.avg_score.sort_values(ascending=False))
    fig = plot_sfs(sfsfit.get_metric_dict(), kind='std_err')
    plt.title('SFS for ' + station + ', ' + season_name + ' (w. StdErr)')
    plt.grid()
    plt.show()

for season_name, season_df in [('Spring', spring), ('Summer', summer),
                               ('Fall', fall), ('Winter', winter)]:
    for station in ('Changping', 'Dongsi', 'Huairou'):
        run_sfs(season_df, station, season_name)
After performing Sequential Feature Selection, we see that for each season, the $R^2$ does not differ that much after around 5 features, thus, we will include SO2, NO2, CO, O3, TEMP, PRES, DEWP , RAIN, WSPM, and wd in the OLS regression model in order to gain more insight from all the variables
# Define the regression formula: PM2.5 on the pollutant and weather
# variables.  The trailing "- 1" removes the intercept so that every wd
# (wind-direction) level gets its own coefficient.
formula = 'PM2_5 ~ SO2 + NO2 + CO + O3 + TEMP + PRES + DEWP + RAIN + WSPM + wd - 1'
We will separate our data into 12 groups, one for each station and season.
# separate the data into the 3 regions via the outer index level
changping = data.loc['Changping']
dongsi = data.loc['Dongsi']
huairou = data.loc['Huairou']

def _season_slice(region, season):
    """Rows of one region for one season, with PM2.5 renamed for the formula API."""
    return region.loc[region.season == season].rename(columns={'PM2.5': 'PM2_5'})

# break down further into 12 data sets
chp_spr = _season_slice(changping, 'Spring')
chp_su = _season_slice(changping, 'Summer')
chp_fa = _season_slice(changping, 'Fall')
chp_win = _season_slice(changping, 'Winter')
ds_spr = _season_slice(dongsi, 'Spring')
ds_su = _season_slice(dongsi, 'Summer')
ds_fa = _season_slice(dongsi, 'Fall')
ds_win = _season_slice(dongsi, 'Winter')
hr_spr = _season_slice(huairou, 'Spring')
hr_su = _season_slice(huairou, 'Summer')
hr_fa = _season_slice(huairou, 'Fall')
hr_win = _season_slice(huairou, 'Winter')
# Fit the OLS model for each season of Changping and keep the text summary.
# iloc[:, 4:32] keeps the pollutant/weather columns referenced by `formula`.
oslm_chp_spr = smf.ols(formula=formula, data=chp_spr.iloc[:,4:32]).fit().summary()
oslm_chp_su = smf.ols(formula=formula, data=chp_su.iloc[:,4:32]).fit().summary()
oslm_chp_fa = smf.ols(formula=formula, data=chp_fa.iloc[:,4:32]).fit().summary()
oslm_chp_win = smf.ols(formula=formula, data=chp_win.iloc[:,4:32]).fit().summary()
print(oslm_chp_spr) #Changping, Spring
print(oslm_chp_su) #Changping, Summer
print(oslm_chp_fa) #Changping, Fall
print(oslm_chp_win) #Changping, Winter
From the OLS regression results for Changping we can see that winter has the highest R-squared (0.812) and summer the lowest (0.487), since PM2.5 is higher in winter than in summer.
For spring in Changping, the p-values of wd are larger than 0.05, so wd is not significant; pressure is also not significant because its p-value is larger than 0.05. The coefficients of RAIN and TEMP are negative.
For summer in Changping, all of the variables' p-values are smaller than 0.05, so all variables are significant. The coefficients of wd are large positive numbers, so wd has an important effect in summer, especially wd(E). TEMP, RAIN and PRES have negative coefficients.
For fall in Changping, all of the variables' p-values are smaller than 0.05, so all variables are significant. However, wd has a large negative coefficient, so an increase in wd decreases PM2.5. RAIN and TEMP are negative.
For winter in Changping, the p-value of RAIN is larger than 0.05, so RAIN is not significant. This matches reality, because it hardly rains in winter in Beijing. The coefficients of wd are large positive numbers, while TEMP, RAIN and PRES have negative coefficients. --Mary
# Fit the OLS model for each season of Dongsi and keep the text summary.
oslm_ds_spr = smf.ols(formula=formula, data=ds_spr.iloc[:,4:32]).fit().summary()
oslm_ds_su = smf.ols(formula=formula, data=ds_su.iloc[:,4:32]).fit().summary()
oslm_ds_fa = smf.ols(formula=formula, data=ds_fa.iloc[:,4:32]).fit().summary()
oslm_ds_win = smf.ols(formula=formula, data=ds_win.iloc[:,4:32]).fit().summary()
print(oslm_ds_spr) #Dongsi, Spring
print(oslm_ds_su) #Dongsi, Summer
print(oslm_ds_fa) #Dongsi, Fall
print(oslm_ds_win) #Dongsi, Winter
From the OLS regression results for Dongsi we can see that winter has the highest R-squared (0.787) and summer the lowest (0.487).
For spring in Dongsi, the p-values of wd and PRES are larger than 0.05, so wd and PRES are not significant. The coefficients of RAIN and TEMP are negative.
For summer in Dongsi, the p-values of wd and PRES are larger than 0.05, so wd and PRES are not significant. RAIN has a negative coefficient.
For fall in Dongsi, the p-value of wd is larger than 0.05, so wd is not significant. The coefficients of RAIN, PRES and TEMP are negative.
For winter in Dongsi, all variables are significant; wd has a large positive coefficient, while TEMP, RAIN and PRES have negative coefficients. --Mary
# Fit the OLS model for each season of Huairou and keep the text summary.
oslm_hr_spr = smf.ols(formula=formula, data=hr_spr.iloc[:,4:32]).fit().summary()
oslm_hr_su = smf.ols(formula=formula, data=hr_su.iloc[:,4:32]).fit().summary()
oslm_hr_fa = smf.ols(formula=formula, data=hr_fa.iloc[:,4:32]).fit().summary()
oslm_hr_win = smf.ols(formula=formula, data=hr_win.iloc[:,4:32]).fit().summary()
print(oslm_hr_spr) #Huairou, Spring
print(oslm_hr_su) #Huairou, Summer
print(oslm_hr_fa) #Huairou, Fall
print(oslm_hr_win) #Huairou, Winter
From the OLS regression results for Huairou we can see that winter has the highest R-squared (0.807) and summer the lowest (0.572).
For spring in Huairou, the p-values of wd are larger than 0.05, so wd is not significant. SO2, TEMP, PRES and RAIN have negative coefficients.
For summer in Huairou, RAIN is not significant because its p-value is larger than 0.05. wd has a large negative coefficient.
For fall in Huairou, wd and PRES are not significant because their p-values are larger than 0.05. SO2, TEMP and RAIN have negative coefficients.
For winter in Huairou, RAIN is not significant because its p-value is larger than 0.05; wd has a large positive coefficient, while TEMP and PRES have negative coefficients. --Mary
For all three districts in Beijing, the R-squared values show a seasonal trend matching the PM2.5 trend: high in winter and low in summer.
Next, we will try Gradient Boosting (GBM) and see which variables are more important for prediction of PM2.5.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import classification_report
from sklearn.model_selection import learning_curve, GridSearchCV
This is how we tune the parameters which we will be using in the below GradientBoostingRegressor.
(We will just use Changping's Spring data for fast tuning)
First, we tune learning_rate and n_estimators:
# Grid-search learning_rate and n_estimators for the GBM (R^2, 5-fold CV).
# NOTE(review): the `iid` parameter was deprecated in scikit-learn 0.22 and
# removed in 0.24 — drop it when upgrading.
# NOTE(review): X_train_spr / y_train_spr are defined in a LATER cell; run
# the Changping-Spring train/test split cell below before this one.
p_test1 = {'learning_rate': [0.01, 0.03, 0.05, 0.1], 'n_estimators':[10000,30000, 50000]}
tuning = GridSearchCV(estimator=GradientBoostingRegressor(max_depth=4, min_samples_split=2, min_samples_leaf=1, subsample=1, max_features='sqrt', random_state=1), param_grid=p_test1, scoring='r2', iid=False, cv=5)
tuning.fit(X_train_spr, y_train_spr)
tuning.best_params_, tuning.best_score_
CV gives us the best parameters learning_rate=0.01 and n_estimators=10000.
Next, we tune max_depth:
# Grid-search max_depth with the learning_rate/n_estimators found above.
# NOTE(review): `iid` was removed in scikit-learn 0.24 — drop it when upgrading.
p_test2 = {'max_depth':[2,4,6,8,10,12] }
tuning = GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=0.01, n_estimators=10000, min_samples_split=2, min_samples_leaf=1, subsample=1, max_features='sqrt', random_state=1), param_grid=p_test2, scoring='r2', iid=False, cv=5)
tuning.fit(X_train_spr, y_train_spr)
tuning.best_params_, tuning.best_score_
CV gives us the best parameters max_depth=10.
Then, we tune min_samples_split and min_samples_leaf:
# Grid-search min_samples_split and min_samples_leaf with the values found above.
# NOTE(review): `iid` was removed in scikit-learn 0.24 — drop it when upgrading.
p_test3 = {'min_samples_split':[2,4,6,8], 'min_samples_leaf':[1,3,5,7]}
tuning = GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=0.01, n_estimators=10000, max_depth=10, subsample=1, max_features='sqrt', random_state=1), param_grid=p_test3, scoring='r2', iid=False, cv=5)
tuning.fit(X_train_spr, y_train_spr)
tuning.best_params_, tuning.best_score_
CV gives us the best parameters min_samples_split=2 and min_samples_leaf=5 and yields $R^2=0.8319670932863257$.
# For Changping, Spring
X_train_spr, X_test_spr, y_train_spr, y_test_spr = train_test_split(
spring.loc['Changping'].iloc[:,6:31], # note we are excluding PM10
spring.loc['Changping'].iloc[:,4], # 4th column is PM2.5
test_size = 1/3,
random_state=1)
y_train_spr = y_train_spr.ravel()
y_test_spr = y_test_spr.ravel()
model_gbm = GradientBoostingRegressor(learning_rate=0.01,
n_estimators=10000,
max_depth=10,
min_samples_split=2,
min_samples_leaf=5,
subsample=1,
max_features=6,
random_state=1)
model_gbm.fit(X_train_spr, y_train_spr)
predictors=list(X_train_spr)
feat_imp = pd.Series(model_gbm.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Changping, Spring')
plt.ylabel('Feature Importance Score')
chp_spr_r2 = model_gbm.score(X_test_spr, y_test_spr)
print(chp_spr_r2) #the r^2 for this model
chp_spr_fi = feat_imp
print(chp_spr_fi) #the important features
# For Changping, Summer
X_train_su, X_test_su, y_train_su, y_test_su = train_test_split(
summer.loc['Changping'].iloc[:,6:31], # note we are excluding PM10
summer.loc['Changping'].iloc[:,4], # 4th column is PM2.5
test_size = 1/3,
random_state=1)
y_train_su = y_train_su.ravel()
y_test_su = y_test_su.ravel()
model_gbm = GradientBoostingRegressor(learning_rate=0.01,
n_estimators=10000,
max_depth=10,
min_samples_split=2,
min_samples_leaf=5,
subsample=1,
max_features=6,
random_state=1)
model_gbm.fit(X_train_su, y_train_su)
predictors=list(X_train_su)
feat_imp = pd.Series(model_gbm.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Changping, Summer')
plt.ylabel('Feature Importance Score')
chp_su_r2 = model_gbm.score(X_test_su, y_test_su)
print(chp_su_r2) #Returns the coefficient of determination R^2 of the prediction
chp_su_fi = feat_imp
print(chp_su_fi) #the important features
# For Changping, Fall
X_train_fa, X_test_fa, y_train_fa, y_test_fa = train_test_split(
fall.loc['Changping'].iloc[:,6:31], # note we are excluding PM10
fall.loc['Changping'].iloc[:,4], # 4th column is PM2.5
test_size = 1/3,
random_state=1)
y_train_fa = y_train_fa.ravel()
y_test_fa = y_test_fa.ravel()
model_gbm = GradientBoostingRegressor(learning_rate=0.01,
n_estimators=10000,
max_depth=10,
min_samples_split=2,
min_samples_leaf=5,
subsample=1,
max_features=6,
random_state=1)
model_gbm.fit(X_train_fa, y_train_fa)
predictors=list(X_train_fa)
feat_imp = pd.Series(model_gbm.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Changping, Fall')
plt.ylabel('Feature Importance Score')
chp_fa_r2 = model_gbm.score(X_test_fa, y_test_fa)
print(chp_fa_r2) #the r^2 for this model
chp_fa_fi = feat_imp
print(chp_fa_fi) #the important features
# For Changping, Winter
X_train_win, X_test_win, y_train_win, y_test_win = train_test_split(
winter.loc['Changping'].iloc[:,6:31], # note we are excluding PM10
winter.loc['Changping'].iloc[:,4], # 4th column is PM2.5
test_size = 1/3,
random_state=1)
y_train_win = y_train_win.ravel()
y_test_win = y_test_win.ravel()
model_gbm = GradientBoostingRegressor(learning_rate=0.01,
n_estimators=10000,
max_depth=10,
min_samples_split=2,
min_samples_leaf=5,
subsample=1,
max_features=6,
random_state=1)
model_gbm.fit(X_train_win, y_train_win)
predictors=list(X_train_win)
feat_imp = pd.Series(model_gbm.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Changping, Winter')
plt.ylabel('Feature Importance Score')
chp_win_r2 = model_gbm.score(X_test_win, y_test_win)
print(chp_win_r2) #the r^2 for this model
chp_win_fi = feat_imp
print(chp_win_fi) #the important features
gbm for the four seasons of Dongsi.¶# For Dongsi, Spring
X_train_spr, X_test_spr, y_train_spr, y_test_spr = train_test_split(
spring.loc['Dongsi'].iloc[:,6:31], # note we are excluding PM10
spring.loc['Dongsi'].iloc[:,4], # 4th column is PM2.5
test_size = 1/3,
random_state=1)
y_train_spr = y_train_spr.ravel()
y_test_spr = y_test_spr.ravel()
model_gbm = GradientBoostingRegressor(learning_rate=0.01,
n_estimators=10000,
max_depth=10,
min_samples_split=2,
min_samples_leaf=5,
subsample=1,
max_features=6,
random_state=1)
model_gbm.fit(X_train_spr, y_train_spr)
predictors=list(X_train_spr)
feat_imp = pd.Series(model_gbm.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Dongsi, Spring')
plt.ylabel('Feature Importance Score')
ds_spr_r2 = model_gbm.score(X_test_spr, y_test_spr)
print(ds_spr_r2) #the r^2 for this model
ds_spr_fi = feat_imp
print(ds_spr_fi) #the important features
# For Dongsi, Summer
X_train_su, X_test_su, y_train_su, y_test_su = train_test_split(
summer.loc['Dongsi'].iloc[:,6:31], # note we are excluding PM10
summer.loc['Dongsi'].iloc[:,4], # 4th column is PM2.5
test_size = 1/3,
random_state=1)
y_train_su = y_train_su.ravel()
y_test_su = y_test_su.ravel()
model_gbm = GradientBoostingRegressor(learning_rate=0.01,
n_estimators=10000,
max_depth=10,
min_samples_split=2,
min_samples_leaf=5,
subsample=1,
max_features=6,
random_state=1)
model_gbm.fit(X_train_su, y_train_su)
predictors=list(X_train_su)
feat_imp = pd.Series(model_gbm.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Dongsi, Summer')
plt.ylabel('Feature Importance Score')
ds_su_r2 = model_gbm.score(X_test_su, y_test_su)
print(ds_su_r2) #Returns the coefficient of determination R^2 of the prediction
ds_su_fi = feat_imp
print(ds_su_fi) #the important features
# Dongsi, Fall: gradient-boosted regression of PM2.5 on the pollutant and
# weather columns (PM10 excluded), with a 2/3 train / 1/3 test split.
X_fa = fall.loc['Dongsi'].iloc[:, 6:31]  # predictors; excludes PM10
y_fa = fall.loc['Dongsi'].iloc[:, 4]     # column 4 is PM2.5
X_train_fa, X_test_fa, y_train_fa, y_test_fa = train_test_split(
    X_fa, y_fa, test_size=1/3, random_state=1)
# Flatten targets to 1-D for the regressor.
y_train_fa = y_train_fa.ravel()
y_test_fa = y_test_fa.ravel()
model_gbm = GradientBoostingRegressor(
    learning_rate=0.01, n_estimators=10000, max_depth=10,
    min_samples_split=2, min_samples_leaf=5, subsample=1,
    max_features=6, random_state=1)
model_gbm.fit(X_train_fa, y_train_fa)
predictors = list(X_train_fa)  # predictor column names
# Rank predictors by their contribution to the fitted model.
feat_imp = pd.Series(model_gbm.feature_importances_,
                     predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Dongsi, Fall')
plt.ylabel('Feature Importance Score')
ds_fa_r2 = model_gbm.score(X_test_fa, y_test_fa)  # held-out R^2
print(ds_fa_r2)
ds_fa_fi = feat_imp  # kept for the cross-season comparison
print(ds_fa_fi)
# For Dongsi, Winter -- NOTE(review): the original comment said "Changping",
# but the code below clearly models Dongsi (winter.loc['Dongsi']).
X_train_win, X_test_win, y_train_win, y_test_win = train_test_split(
winter.loc['Dongsi'].iloc[:,6:31], # note we are excluding PM10
winter.loc['Dongsi'].iloc[:,4], # 4th column is PM2.5
test_size = 1/3,
random_state=1)
# Flatten the targets to 1-D arrays for the regressor.
y_train_win = y_train_win.ravel()
y_test_win = y_test_win.ravel()
# Same GBM hyperparameters used for every season/district in this study.
model_gbm = GradientBoostingRegressor(learning_rate=0.01,
n_estimators=10000,
max_depth=10,
min_samples_split=2,
min_samples_leaf=5,
subsample=1,
max_features=6,
random_state=1)
model_gbm.fit(X_train_win, y_train_win)
predictors=list(X_train_win)
# Rank predictors by how much they contribute to the fitted model.
feat_imp = pd.Series(model_gbm.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Dongsi, Winter')
plt.ylabel('Feature Importance Score')
ds_win_r2 = model_gbm.score(X_test_win, y_test_win)
print(ds_win_r2) #the r^2 for this model (R^2 on the held-out test set)
ds_win_fi = feat_imp
print(ds_win_fi) #the important features
GBM models for the four seasons of Huairou.¶# For Huairou, Spring
# Huairou, Spring: gradient-boosted regression of PM2.5 on the pollutant and
# weather columns (PM10 excluded), with a 2/3 train / 1/3 test split.
X_spr = spring.loc['Huairou'].iloc[:, 6:31]  # predictors; excludes PM10
y_spr = spring.loc['Huairou'].iloc[:, 4]     # column 4 is PM2.5
X_train_spr, X_test_spr, y_train_spr, y_test_spr = train_test_split(
    X_spr, y_spr, test_size=1/3, random_state=1)
# Flatten targets to 1-D for the regressor.
y_train_spr = y_train_spr.ravel()
y_test_spr = y_test_spr.ravel()
model_gbm = GradientBoostingRegressor(
    learning_rate=0.01, n_estimators=10000, max_depth=10,
    min_samples_split=2, min_samples_leaf=5, subsample=1,
    max_features=6, random_state=1)
model_gbm.fit(X_train_spr, y_train_spr)
predictors = list(X_train_spr)  # predictor column names
# Rank predictors by their contribution to the fitted model.
feat_imp = pd.Series(model_gbm.feature_importances_,
                     predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Huairou, Spring')
plt.ylabel('Feature Importance Score')
hr_spr_r2 = model_gbm.score(X_test_spr, y_test_spr)  # held-out R^2
print(hr_spr_r2)
hr_spr_fi = feat_imp  # kept for the cross-season comparison
print(hr_spr_fi)
# Huairou, Summer: gradient-boosted regression of PM2.5 on the pollutant and
# weather columns (PM10 excluded), with a 2/3 train / 1/3 test split.
X_su = summer.loc['Huairou'].iloc[:, 6:31]  # predictors; excludes PM10
y_su = summer.loc['Huairou'].iloc[:, 4]     # column 4 is PM2.5
X_train_su, X_test_su, y_train_su, y_test_su = train_test_split(
    X_su, y_su, test_size=1/3, random_state=1)
# Flatten targets to 1-D for the regressor.
y_train_su = y_train_su.ravel()
y_test_su = y_test_su.ravel()
model_gbm = GradientBoostingRegressor(
    learning_rate=0.01, n_estimators=10000, max_depth=10,
    min_samples_split=2, min_samples_leaf=5, subsample=1,
    max_features=6, random_state=1)
model_gbm.fit(X_train_su, y_train_su)
predictors = list(X_train_su)  # predictor column names
# Rank predictors by their contribution to the fitted model.
feat_imp = pd.Series(model_gbm.feature_importances_,
                     predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Huairou, Summer')
plt.ylabel('Feature Importance Score')
hr_su_r2 = model_gbm.score(X_test_su, y_test_su)  # held-out R^2
print(hr_su_r2)
hr_su_fi = feat_imp  # kept for the cross-season comparison
print(hr_su_fi)
# Huairou, Fall: gradient-boosted regression of PM2.5 on the pollutant and
# weather columns (PM10 excluded), with a 2/3 train / 1/3 test split.
X_fa = fall.loc['Huairou'].iloc[:, 6:31]  # predictors; excludes PM10
y_fa = fall.loc['Huairou'].iloc[:, 4]     # column 4 is PM2.5
X_train_fa, X_test_fa, y_train_fa, y_test_fa = train_test_split(
    X_fa, y_fa, test_size=1/3, random_state=1)
# Flatten targets to 1-D for the regressor.
y_train_fa = y_train_fa.ravel()
y_test_fa = y_test_fa.ravel()
model_gbm = GradientBoostingRegressor(
    learning_rate=0.01, n_estimators=10000, max_depth=10,
    min_samples_split=2, min_samples_leaf=5, subsample=1,
    max_features=6, random_state=1)
model_gbm.fit(X_train_fa, y_train_fa)
predictors = list(X_train_fa)  # predictor column names
# Rank predictors by their contribution to the fitted model.
feat_imp = pd.Series(model_gbm.feature_importances_,
                     predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Huairou, Fall')
plt.ylabel('Feature Importance Score')
hr_fa_r2 = model_gbm.score(X_test_fa, y_test_fa)  # held-out R^2
print(hr_fa_r2)
hr_fa_fi = feat_imp  # kept for the cross-season comparison
print(hr_fa_fi)
# Huairou, Winter: gradient-boosted regression of PM2.5 on the pollutant and
# weather columns (PM10 excluded), with a 2/3 train / 1/3 test split.
X_win = winter.loc['Huairou'].iloc[:, 6:31]  # predictors; excludes PM10
y_win = winter.loc['Huairou'].iloc[:, 4]     # column 4 is PM2.5
X_train_win, X_test_win, y_train_win, y_test_win = train_test_split(
    X_win, y_win, test_size=1/3, random_state=1)
# Flatten targets to 1-D for the regressor.
y_train_win = y_train_win.ravel()
y_test_win = y_test_win.ravel()
model_gbm = GradientBoostingRegressor(
    learning_rate=0.01, n_estimators=10000, max_depth=10,
    min_samples_split=2, min_samples_leaf=5, subsample=1,
    max_features=6, random_state=1)
model_gbm.fit(X_train_win, y_train_win)
predictors = list(X_train_win)  # predictor column names
# Rank predictors by their contribution to the fitted model.
feat_imp = pd.Series(model_gbm.feature_importances_,
                     predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features for Huairou, Winter')
plt.ylabel('Feature Importance Score')
hr_win_r2 = model_gbm.score(X_test_win, y_test_win)  # held-out R^2
print(hr_win_r2)
hr_win_fi = feat_imp  # kept for the cross-season comparison
print(hr_win_fi)
# Changping: assemble one feature-importance column per season and plot.
columns=['Spring', 'Summer', 'Fall', 'Winter']
seasonal_chp = [chp_spr_fi.sort_index(), chp_su_fi.sort_index(),
                chp_fa_fi.sort_index(), chp_win_fi.sort_index()]
changping_fi = pd.DataFrame(seasonal_chp, index=columns).T
print(changping_fi)
changping_fi.plot(kind='bar',title='Feature Importance for Changping')
# Dongsi: assemble one feature-importance column per season and plot.
seasonal_ds = [ds_spr_fi.sort_index(), ds_su_fi.sort_index(),
               ds_fa_fi.sort_index(), ds_win_fi.sort_index()]
dongsi_fi = pd.DataFrame(seasonal_ds, index=columns).T
print(dongsi_fi)
dongsi_fi.plot(kind='bar',title='Feature Importance for Dongsi')
# Huairou: assemble one feature-importance column per season and plot.
huairou_fi = pd.DataFrame([hr_spr_fi.sort_index(), hr_su_fi.sort_index(), hr_fa_fi.sort_index(), hr_win_fi.sort_index()], index=columns).T
print(huairou_fi)
# Bug fix: the original plotted dongsi_fi here under the Huairou title,
# so the "Huairou" chart actually showed Dongsi's importances.
huairou_fi.plot(kind='bar',title='Feature Importance for Huairou')
# Test-set R^2 of each seasonal gbm model, by district.
# (Removed a stray, effect-free duplicate of the Changping R^2 list that
# preceded this table, and print the table so it is visible when run as a
# script rather than only as a notebook cell display.)
d = {'Changping':[chp_spr_r2, chp_su_r2, chp_fa_r2, chp_win_r2],
     'Dongsi':[ds_spr_r2, ds_su_r2, ds_fa_r2, ds_win_r2],
     'Huairou':[hr_spr_r2, hr_su_r2, hr_fa_r2, hr_win_r2]}
r2_table = pd.DataFrame(data=d, index=columns)
print(r2_table)
Carbon monoxide is a colorless, odorless, tasteless flammable gas that is slightly less dense than air. Each molecule consists of one carbon atom and one oxygen atom, and it is one of the main atmospheric pollutants.
# Scatter of CO against PM2.5 over the combined three-district data.
co = data[["CO"]]
pm25 = data[["PM2.5"]]
plt.scatter(co, pm25)
plt.title("CO and PM2.5")
plt.ylabel("PM2.5")
plt.xlabel("CO")
plt.show()
CO and PM2.5 have a positive relationship.
The atmosphere is the layer of air surrounding the Earth. The PRES column measures atmospheric pressure — the weight of the air column above each square metre of the Earth's surface — which at sea level is about 1013 hPa.
# Scatter of atmospheric pressure against PM2.5 over the combined data.
pres = data[["PRES"]]
pm25 = data[["PM2.5"]]
plt.scatter(pres, pm25, color="red")
plt.title("PRES and PM2.5")
plt.ylabel("PM2.5")
plt.xlabel("PRES")
plt.show()
The PRES–PM2.5 scatter has a roughly bell-shaped profile: as pressure moves away from the central range in either direction, PM2.5 tends to decrease.
Rain is liquid water in the form of droplets that have condensed from atmospheric water vapor and then become heavy enough to fall under gravity.
# Scatter of rainfall against PM2.5 over the combined data.
rain = data[["RAIN"]]
pm25 = data[["PM2.5"]]
plt.scatter(rain, pm25, color="green")
plt.title("RAIN and PM2.5")
plt.ylabel("PM2.5")
plt.xlabel("RAIN")
plt.show()
As rainfall increases, PM2.5 decreases.